Step 1: Data Collection | Step 2: Data Preprocessing | Step 3: EDA (Exploratory Data Analysis) | Step 4: Conclusions
Performing Data Collection by using Web Scraping
BeautifulSoup – Python library for getting data out of HTML, XML, and other markup languages.
Command to install beautifulsoup4: `!pip install beautifulsoup4`
from bs4 import BeautifulSoup as soup
from datetime import date, datetime
from urllib.request import Request, urlopen
import pandas as pd # Pandas – Python library for data manipulation and analysis.
import numpy as np # NumPy is a Python library used for working with arrays. It also has functions for working in domain of linear algebra, fourier transform, and matrices.
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import seaborn as sns
import gc
import warnings
warnings.filterwarnings("ignore")
# ! pip install pandas_profiling
from pandas_profiling import ProfileReport
# Build a label for yesterday's date in the form "<abbreviated month> <day>,<year>".
# Subtracting a timedelta rolls over month and year boundaries correctly;
# computing "today.day - 1" produced day 0 on the first day of a month and
# kept the current month/year even when yesterday belonged to the previous one.
from datetime import timedelta

today = datetime.now()
_yesterday = today - timedelta(days=1)
yesterday_str = "%s %d,%d" % (_yesterday.strftime("%b"), _yesterday.day, _yesterday.year)
Web Scraping :
# Download the Worldometers coronavirus page and collect the rows of the
# table whose id is "main_table_countries_yesterday".
url = "https://www.worldometers.info/coronavirus/#countries"
# The request carries a browser-like User-Agent header.
req = Request(url, headers={'User-Agent': "Mozilla/5.0"})
# Parse inside a context manager so the HTTP response is closed as soon as
# the document has been read (the response was previously left open).
with urlopen(req) as webpage:
    page_soup = soup(webpage, "html.parser")
table = page_soup.find_all("table", {"id": "main_table_countries_yesterday"})
# Keep only the rows whose "style" attribute matches the empty string.
containers = table[0].find_all("tr", {"style": ""})
# The first matching row is kept separately as `title` (presumably the
# header row -- confirm against the page) and removed from the data rows.
title = containers[0]
del containers[0]
def _clean_cell(text, numeric):
    """Convert the text of one scraped table cell into the value to store.

    Args:
        text: raw text of the cell.
        numeric: when true, thousands separators (",") are removed and a
            value containing "+" or "-" is converted to ``float``.

    Returns:
        A ``float`` for signed numeric cells (negative when the text
        contained "-"), 0 for "N/A", the sentinel -1 for an empty or
        single-space cell, and otherwise the (comma-stripped) text.
    """
    value = text
    if numeric:
        value = value.replace(",", "")
        if "+" in value:
            value = float(value.replace("+", ""))
        elif "-" in value:
            # Keep the sign: the "-" used to be stripped and only the
            # magnitude stored, which turned negative figures into
            # positive ones.
            value = -float(value.replace("-", ""))
    if value == "N/A":
        return 0
    if value == "" or value == " ":
        return -1
    return value


# One list of cleaned cell values per table row.
all_data = []
clean = True  # set to False to skip the numeric cleaning of cells
for country in containers:
    country_container = country.find_all("td")
    # Rows whose second cell reads "China" are skipped.
    if country_container[1].text == "China":
        continue
    last_index = len(country_container) - 1
    country_data = []
    # Cell 0 is never read.  Cell 1 (the one compared against a country
    # name above) and the final cell are kept as text.
    for i in range(1, len(country_container)):
        is_numeric = clean and i != 1 and i != last_index
        country_data.append(_clean_cell(country_container[i].text, is_numeric))
    all_data.append(country_data)
Performing Data Preprocessing using Pandas
# Load the scraped rows into a DataFrame and discard the columns labelled
# 15 through 20, leaving columns 0-14.
df = pd.DataFrame(all_data)
df = df.drop(columns=list(range(15, 21)))
df.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | World | 314023118 | 2846498.0 | 5521223 | 8265.0 | 261713993 | 975709.0 | 46787902 | 95534 | 40286 | 708.3 | -1 | -1 | -1 | All |
| 1 | USA | 63390876 | 672872.0 | 863896 | 2173.0 | 42641852 | 136478.0 | 19885128 | 24368 | 189812 | 2587 | 844714275 | 2529338 | 333966543 | North America |
| 2 | India | 36070510 | 194720.0 | 484655 | 442.0 | 34630536 | 60405.0 | 955319 | 8944 | 25751 | 346 | 693155280 | 494857 | 1400717076 | Asia |
| 3 | Brazil | 22630142 | 71447.0 | 620281 | 139.0 | 21626836 | -1.0 | 383025 | 8318 | 105322 | 2887 | 63776166 | 296818 | 214866497 | South America |
| 4 | UK | 14732551 | 120806.0 | 150609 | 379.0 | 10945874 | 87362.0 | 3636068 | 820 | 215292 | 2201 | 425464553 | 6217464 | 68430567 | Europe |
# Human-readable names for the 15 remaining columns, in positional order.
column_labels = [
    "Country",
    "Total Cases",
    "New Cases",
    "Total Deaths",
    "New Deaths",
    "Total Recovered",
    "New Recovered",
    "Active Cases",
    "Serious/Critical",
    "Total Cases/1M",
    "Deaths/1M",
    "Total Tests",
    "Test/1M",
    "Population",
    "Continent",
]
df.columns = column_labels
df.head()
| Country | Total Cases | New Cases | Total Deaths | New Deaths | Total Recovered | New Recovered | Active Cases | Serious/Critical | Total Cases/1M | Deaths/1M | Total Tests | Test/1M | Population | Continent | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | World | 314023118 | 2846498.0 | 5521223 | 8265.0 | 261713993 | 975709.0 | 46787902 | 95534 | 40286 | 708.3 | -1 | -1 | -1 | All |
| 1 | USA | 63390876 | 672872.0 | 863896 | 2173.0 | 42641852 | 136478.0 | 19885128 | 24368 | 189812 | 2587 | 844714275 | 2529338 | 333966543 | North America |
| 2 | India | 36070510 | 194720.0 | 484655 | 442.0 | 34630536 | 60405.0 | 955319 | 8944 | 25751 | 346 | 693155280 | 494857 | 1400717076 | Asia |
| 3 | Brazil | 22630142 | 71447.0 | 620281 | 139.0 | 21626836 | -1.0 | 383025 | 8318 | 105322 | 2887 | 63776166 | 296818 | 214866497 | South America |
| 4 | UK | 14732551 | 120806.0 | 150609 | 379.0 | 10945874 | 87362.0 | 3636068 | 820 | 215292 | 2201 | 425464553 | 6217464 | 68430567 | Europe |
def _percent_of_total(new, total):
    """Return ``new / total * 100`` with unusable inputs masked as NaN.

    The scraping step stores -1 for cells that were empty, and a total may
    be 0.  Dividing those values produced meaningless percentages (for
    example -1 / -1 * 100 == 100), so they are masked before the division
    and the result is NaN for such rows.

    Args:
        new: numeric Series with the daily figure.
        total: numeric Series with the cumulative figure.

    Returns:
        Series of percentages, NaN where ``new`` is the -1 sentinel or
        ``total`` is not positive.
    """
    # NOTE(review): a genuine value of exactly -1 cannot be told apart from
    # the sentinel and is masked as well.
    usable_new = new.where(new != -1)
    usable_total = total.where(total > 0)
    return usable_new / usable_total * 100


# Every column except the two text columns is converted to a numeric dtype.
for label in df.columns:
    if label not in ("Country", "Continent"):
        df[label] = pd.to_numeric(df[label])

# Daily figure expressed as a percentage of the cumulative figure.
df["%Increase Cases"] = _percent_of_total(df["New Cases"], df["Total Cases"])
df["%Increase Deaths"] = _percent_of_total(df["New Deaths"], df["Total Deaths"])
df["%Increase Recovered"] = _percent_of_total(df["New Recovered"], df["Total Recovered"])
df.head()
| Country | Total Cases | New Cases | Total Deaths | New Deaths | Total Recovered | New Recovered | Active Cases | Serious/Critical | Total Cases/1M | Deaths/1M | Total Tests | Test/1M | Population | Continent | %Increase Cases | %Increase Deaths | %Increase Recovered | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | World | 314023118 | 2846498.0 | 5521223 | 8265.0 | 261713993 | 975709.0 | 46787902 | 95534 | 40286.0 | 708.3 | -1 | -1 | -1 | All | 0.906461 | 0.149695 | 0.372815 |
| 1 | USA | 63390876 | 672872.0 | 863896 | 2173.0 | 42641852 | 136478.0 | 19885128 | 24368 | 189812.0 | 2587.0 | 844714275 | 2529338 | 333966543 | North America | 1.061465 | 0.251535 | 0.320056 |
| 2 | India | 36070510 | 194720.0 | 484655 | 442.0 | 34630536 | 60405.0 | 955319 | 8944 | 25751.0 | 346.0 | 693155280 | 494857 | 1400717076 | Asia | 0.539832 | 0.091199 | 0.174427 |
| 3 | Brazil | 22630142 | 71447.0 | 620281 | 139.0 | 21626836 | -1.0 | 383025 | 8318 | 105322.0 | 2887.0 | 63776166 | 296818 | 214866497 | South America | 0.315716 | 0.022409 | -0.000005 |
| 4 | UK | 14732551 | 120806.0 | 150609 | 379.0 | 10945874 | 87362.0 | 3636068 | 820 | 215292.0 | 2201.0 | 425464553 | 6217464 | 68430567 | Europe | 0.819994 | 0.251645 | 0.798127 |
Performing EDA - Exploratory Data Analysis
# Stacked bar showing how recovered, active and fatal cases split the total
# for row 0 of df.
cases = df[["Total Recovered", "Active Cases", "Total Deaths"]].loc[0]
# One row per category: its name in "Type", its value in "Total".
cases_df = cases.rename_axis("Type").reset_index(name="Total")
cases_df["Percentage"] = (100 * cases_df["Total"] / cases_df["Total"].sum()).round(2)
# A constant x value places all categories in a single stacked bar.
cases_df["virus"] = ["COVID-19"] * len(cases_df)
fig = px.bar(
    cases_df,
    x="virus",
    y="Percentage",
    color="Type",
    hover_data=["Total"],
)
fig.show()
# Stacked bar showing how new cases, new recoveries and new deaths split
# their combined total for row 0 of df.
cases = df[["New Cases", "New Recovered", "New Deaths"]].loc[0]
# One row per category: its name in "Type", its value in "Total".
cases_df = cases.rename_axis("Type").reset_index(name="Total")
cases_df["Percentage"] = (100 * cases_df["Total"] / cases_df["Total"].sum()).round(2)
# A constant x value places all categories in a single stacked bar.
cases_df["virus"] = ["COVID-19"] * len(cases_df)
fig = px.bar(
    cases_df,
    x="virus",
    y="Percentage",
    color="Type",
    hover_data=["Total"],
)
fig.show()
# Bar chart of the three percentage-increase columns for row 0 of df,
# rounded to two decimals.
per = df[["%Increase Cases", "%Increase Deaths", "%Increase Recovered"]].loc[0].round(2)
per_df = per.to_frame(name="Percentage")
fig = go.Figure(
    data=[
        go.Bar(
            x=per_df.index,
            y=per_df["Percentage"],
            marker_color=["yellow", "blue", "red"],
        )
    ]
)
fig.show()
# Sum every numeric column per continent and remove the "All" group.
# numeric_only=True keeps the text "Country" column out of the aggregation;
# without it pandas >= 2.0 concatenates the country names of each group
# instead of dropping the column.
# NOTE(review): per-million and percentage columns are summed too, so those
# continent figures are not meaningful rates -- confirm whether that is intended.
continent_df = (
    df.groupby("Continent")
    .sum(numeric_only=True)
    .drop("All")
    .reset_index()
)
continent_df
| Continent | Total Cases | New Cases | Total Deaths | New Deaths | Total Recovered | New Recovered | Active Cases | Serious/Critical | Total Cases/1M | Deaths/1M | Total Tests | Test/1M | Population | %Increase Cases | %Increase Deaths | %Increase Recovered | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Africa | 10297066 | 55765.0 | 232666 | 311.0 | 9069295 | 37252.0 | 957952 | 2619 | 1274247.0 | 16115.0 | 89721251 | 9204171 | 1389530855 | 41.944479 | -89.502570 | 18.496748 |
| 1 | Asia | 87584406 | 420665.0 | 1261504 | 1321.0 | 83054584 | 136997.0 | 3268317 | 26819 | 2661373.0 | 26348.0 | 1577782939 | 68135869 | 3221832309 | 18.360696 | 69.758727 | 3.767739 |
| 2 | Australia/Oceania | 1312852 | 93605.0 | 4697 | 23.0 | 541659 | 436.0 | 720416 | 372 | 369237.0 | 4807.0 | 64142257 | 7172573 | 42440133 | 4.092049 | 284.890159 | -37.259427 |
| 3 | Europe | 98616120 | 1254716.0 | 1557956 | 3805.0 | 78825759 | 534420.0 | 18232405 | 21172 | 7662960.0 | 95453.0 | 1780702856 | 152667527 | 748318872 | 55.400244 | 6.392763 | 42.240972 |
| 4 | North America | 74497342 | 742329.0 | 1264377 | 2373.0 | 51861783 | 197533.0 | 21364100 | 30516 | 3878852.0 | 42524.0 | 946509258 | 63987294 | 596156946 | 61.320722 | -108.649198 | 58.165758 |
| 5 | South America | 41610602 | 279190.0 | 1195365 | 348.0 | 36501280 | 68878.0 | 1704451 | 13956 | 1201344.0 | 28323.0 | 190670472 | 10649198 | 436186051 | 11.437395 | 100.310135 | 3.054274 |
def continent_visualization(v_list):
    """Show one stacked bar chart per column with each continent's share.

    For every label in *v_list* the column of that name is read from the
    module-level ``continent_df``, each continent's value is expressed as a
    percentage of the column sum (rounded to two decimals), and the result
    is drawn as a single stacked bar coloured by continent.

    Args:
        v_list: iterable of column names present in ``continent_df``.

    Returns:
        None. Each figure is displayed with ``fig.show()``.
    """
    for label in v_list:
        values = continent_df[label]
        # assign() returns a new frame, so the extra columns are no longer
        # written onto a slice of continent_df (which triggered pandas'
        # SettingWithCopy warning).
        c_df = continent_df[["Continent", label]].assign(
            Percentage=np.round(100 * values / np.sum(values), 2),
            # A constant x value places all continents in one stacked bar.
            Virus="COVID-19",
        )
        fig = px.bar(c_df, x="Virus", y="Percentage", color="Continent", hover_data=[label])
        fig.update_layout(title={'text': label})
        fig.show()
        gc.collect()
# Column groups to chart per continent: case counts, deaths, recoveries.
cases_list = [
    "Total Cases",
    "Active Cases",
    "New Cases",
    "Serious/Critical",
    "Total Cases/1M",
]
deaths_list = [
    "Total Deaths",
    "New Deaths",
    "Deaths/1M",
]
recovered_list = [
    "Total Recovered",
    "New Recovered",
    "%Increase Recovered",
]
# Render the groups in the same order: cases, then deaths, then recoveries.
for metric_group in (cases_list, deaths_list, recovered_list):
    continent_visualization(metric_group)
# Remove the row labelled len(df) - 1 from df, then build country_df as df
# without the row labelled 0.
df = df.drop(index=len(df) - 1)
country_df = df.drop(index=0)
country_df
| Country | Total Cases | New Cases | Total Deaths | New Deaths | Total Recovered | New Recovered | Active Cases | Serious/Critical | Total Cases/1M | Deaths/1M | Total Tests | Test/1M | Population | Continent | %Increase Cases | %Increase Deaths | %Increase Recovered | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | USA | 63390876 | 672872.0 | 863896 | 2173.0 | 42641852 | 136478.0 | 19885128 | 24368 | 189812.0 | 2587.0 | 844714275 | 2529338 | 333966543 | North America | 1.061465 | 0.251535 | 0.320056 |
| 2 | India | 36070510 | 194720.0 | 484655 | 442.0 | 34630536 | 60405.0 | 955319 | 8944 | 25751.0 | 346.0 | 693155280 | 494857 | 1400717076 | Asia | 0.539832 | 0.091199 | 0.174427 |
| 3 | Brazil | 22630142 | 71447.0 | 620281 | 139.0 | 21626836 | -1.0 | 383025 | 8318 | 105322.0 | 2887.0 | 63776166 | 296818 | 214866497 | South America | 0.315716 | 0.022409 | -0.000005 |
| 4 | UK | 14732551 | 120806.0 | 150609 | 379.0 | 10945874 | 87362.0 | 3636068 | 820 | 215292.0 | 2201.0 | 425464553 | 6217464 | 68430567 | Europe | 0.819994 | 0.251645 | 0.798127 |
| 5 | France | 12573263 | 368149.0 | 126059 | 341.0 | 8672310 | 83992.0 | 3774894 | 3333 | 191975.0 | 1925.0 | 188795159 | 2882628 | 65494103 | Europe | 2.928031 | 0.270508 | 0.968508 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 211 | Macao | 79 | -1.0 | -1 | -1.0 | 77 | -1.0 | 2 | -1 | 119.0 | -1.0 | 5075 | 7656 | 662844 | Asia | -1.265823 | 100.000000 | -1.298701 |
| 212 | Palau | 32 | 4.0 | -1 | -1.0 | 12 | -1.0 | 20 | -1 | 1756.0 | -1.0 | 18788 | 1030835 | 18226 | Australia/Oceania | 12.500000 | 100.000000 | -8.333333 |
| 213 | Solomon Islands | 25 | -1.0 | -1 | -1.0 | 20 | -1.0 | 5 | -1 | 35.0 | -1.0 | 4500 | 6315 | 712566 | Australia/Oceania | -4.000000 | 100.000000 | -5.000000 |
| 214 | Western Sahara | 10 | -1.0 | 1 | -1.0 | 8 | -1.0 | 1 | -1 | 16.0 | 2.0 | -1 | -1 | 619731 | Africa | -10.000000 | -100.000000 | -12.500000 |
| 215 | Marshall Islands | 7 | -1.0 | -1 | -1.0 | 4 | -1.0 | 3 | -1 | 117.0 | -1.0 | -1 | -1 | 59800 | Australia/Oceania | -14.285714 | 100.000000 | -25.000000 |
215 rows × 18 columns
Conclusions : Top 5 Countries Covid-19 trends
# Grouped bar chart (log-scaled y axis) of the first LOOK_AT rows of
# country_df, one trace per country.
LOOK_AT = 5
# Columns at positions 1-13 are used as the x axis categories.
country = country_df.columns[1:14]
fig = go.Figure()
for row_label in country_df.index[:LOOK_AT]:
    row = country_df.loc[row_label]
    fig.add_trace(
        go.Bar(
            name=row["Country"],
            x=country,
            y=row.iloc[1:14],
        )
    )
fig.update_layout(title={"text": f'top {LOOK_AT} countries affected '}, yaxis_type="log")
fig.show()